# Load packages
library(here)
library(tidyverse)
library(gt)
library(e1071)
library(scales)
library(corrplot)
library(caret)
library(randomForest)
library(glmnet)
library(gbm)
# Read in data
# The path components are handed to here() separately so that here() joins
# them itself, instead of relying on a hand-written "//" separator.
data <- read_csv(here("inputs", "data_prep.csv")) %>%
  # room_type and license are categorical, so store them as factors
  mutate(
    room_type = as.factor(room_type),
    license = as.factor(license)
  )
For this analysis we’ll explore the listings data for Airbnb rentals in Toronto. The data can be found at link.
We’re primarily interested in the rental price; however, we’ll explore the entire dataset for anything interesting and visualize the results. The analysis will conclude with a model for predicting rental prices.
We’ve previously removed all listings with no reviews. We also removed all listings with no availability within the next year, as these are likely no longer actively being rented, and removed NA columns from the data.
We’ll generate some initial summary statistics of the various predictors to get started:
# Print per-column summary statistics for the whole dataset
data %>%
  summary()
## id listing_url scrape_id
## Min. :8.077e+03 Length:9358 Min. :2.023e+13
## 1st Qu.:2.568e+07 Class :character 1st Qu.:2.023e+13
## Median :4.633e+07 Mode :character Median :2.023e+13
## Mean :2.448e+17 Mean :2.023e+13
## 3rd Qu.:6.565e+17 3rd Qu.:2.023e+13
## Max. :8.411e+17 Max. :2.023e+13
##
## last_scraped source name description
## Min. :2023-03-09 Length:9358 Length:9358 Length:9358
## 1st Qu.:2023-03-10 Class :character Class :character Class :character
## Median :2023-03-10 Mode :character Mode :character Mode :character
## Mean :2023-03-09
## 3rd Qu.:2023-03-10
## Max. :2023-03-26
##
## neighborhood_overview picture_url host_id
## Length:9358 Length:9358 Min. : 22795
## Class :character Class :character 1st Qu.: 28909542
## Mode :character Mode :character Median :113615879
## Mean :169614255
## 3rd Qu.:283517431
## Max. :506119636
##
## host_url host_name host_since host_location
## Length:9358 Length:9358 Min. :2009-06-22 Length:9358
## Class :character Class :character 1st Qu.:2015-03-16 Class :character
## Mode :character Mode :character Median :2017-01-28 Mode :character
## Mean :2017-05-09
## 3rd Qu.:2019-08-06
## Max. :2023-03-19
##
## host_about host_response_time host_response_rate host_acceptance_rate
## Length:9358 Length:9358 Length:9358 Length:9358
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## host_is_superhost host_thumbnail_url host_picture_url host_neighbourhood
## Mode :logical Length:9358 Length:9358 Length:9358
## FALSE:5715 Class :character Class :character Class :character
## TRUE :3643 Mode :character Mode :character Mode :character
##
##
##
##
## host_listings_count host_total_listings_count host_verifications
## Min. : 1.000 Min. : 1.00 Length:9358
## 1st Qu.: 1.000 1st Qu.: 1.00 Class :character
## Median : 2.000 Median : 3.00 Mode :character
## Mean : 7.539 Mean : 12.51
## 3rd Qu.: 5.000 3rd Qu.: 8.00
## Max. :513.000 Max. :585.00
##
## host_has_profile_pic host_identity_verified neighbourhood
## Mode :logical Mode :logical Length:9358
## FALSE:139 FALSE:607 Class :character
## TRUE :9219 TRUE :8751 Mode :character
##
##
##
##
## neighbourhood_cleansed neighbourhood_group_cleansed latitude
## Length:9358 Mode:logical Min. :43.59
## Class :character NA's:9358 1st Qu.:43.65
## Mode :character Median :43.67
## Mean :43.68
## 3rd Qu.:43.71
## Max. :43.84
##
## longitude property_type room_type accommodates
## Min. :-79.62 Length:9358 Entire home/apt:6176 Min. : 1.000
## 1st Qu.:-79.43 Class :character Hotel room : 1 1st Qu.: 2.000
## Median :-79.40 Mode :character Private room :3109 Median : 2.000
## Mean :-79.40 Shared room : 72 Mean : 3.201
## 3rd Qu.:-79.37 3rd Qu.: 4.000
## Max. :-79.13 Max. :16.000
##
## bathrooms bathrooms_text bedrooms beds
## Mode:logical Length:9358 Min. :1.000 Min. : 1.000
## NA's:9358 Class :character 1st Qu.:1.000 1st Qu.: 1.000
## Mode :character Median :1.000 Median : 1.000
## Mean :1.487 Mean : 1.783
## 3rd Qu.:2.000 3rd Qu.: 2.000
## Max. :9.000 Max. :11.000
## NA's :88
## amenities price minimum_nights maximum_nights
## Length:9358 Min. : 14.0 Min. : 1.00 Min. : 1.0
## Class :character 1st Qu.: 79.0 1st Qu.: 3.00 1st Qu.: 90.0
## Mode :character Median : 125.0 Median : 28.00 Median : 365.0
## Mean : 184.3 Mean : 24.02 Mean : 557.2
## 3rd Qu.: 206.0 3rd Qu.: 28.00 3rd Qu.: 1125.0
## Max. :51561.0 Max. :1125.00 Max. :10001.0
##
## minimum_minimum_nights maximum_minimum_nights minimum_maximum_nights
## Min. : 1.00 Min. : 1.00 Min. :1.000e+00
## 1st Qu.: 3.00 1st Qu.: 3.00 1st Qu.:3.650e+02
## Median : 28.00 Median : 28.00 Median :1.125e+03
## Mean : 23.73 Mean : 25.25 Mean :6.892e+05
## 3rd Qu.: 28.00 3rd Qu.: 28.00 3rd Qu.:1.125e+03
## Max. :1125.00 Max. :1125.00 Max. :2.147e+09
##
## maximum_maximum_nights minimum_nights_avg_ntm maximum_nights_avg_ntm
## Min. :1.000e+00 Min. : 1.00 Min. :1.000e+00
## 1st Qu.:3.650e+02 1st Qu.: 3.00 1st Qu.:3.650e+02
## Median :1.125e+03 Median : 28.00 Median :1.125e+03
## Mean :6.892e+05 Mean : 24.41 Mean :6.892e+05
## 3rd Qu.:1.125e+03 3rd Qu.: 28.00 3rd Qu.:1.125e+03
## Max. :2.147e+09 Max. :1125.00 Max. :2.147e+09
##
## calendar_updated has_availability availability_30 availability_60
## Mode:logical Mode :logical Min. : 0.00 Min. : 0.00
## NA's:9358 FALSE:2 1st Qu.: 1.00 1st Qu.: 8.00
## TRUE :9356 Median :10.00 Median :32.00
## Mean :13.32 Mean :30.85
## 3rd Qu.:27.00 3rd Qu.:54.00
## Max. :30.00 Max. :60.00
##
## availability_90 availability_365 calendar_last_scraped number_of_reviews
## Min. : 0.00 Min. : 1.0 Min. :2023-03-09 Min. : 1.00
## 1st Qu.:22.00 1st Qu.: 78.0 1st Qu.:2023-03-10 1st Qu.: 3.00
## Median :52.00 Median :172.0 Median :2023-03-10 Median : 12.00
## Mean :49.89 Mean :183.9 Mean :2023-03-09 Mean : 36.03
## 3rd Qu.:83.00 3rd Qu.:304.0 3rd Qu.:2023-03-10 3rd Qu.: 41.00
## Max. :90.00 Max. :365.0 Max. :2023-03-26 Max. :798.00
##
## number_of_reviews_ltm number_of_reviews_l30d first_review
## Min. : 0.00 Min. : 0.0000 Min. :2009-08-20
## 1st Qu.: 1.00 1st Qu.: 0.0000 1st Qu.:2018-09-10
## Median : 4.00 Median : 0.0000 Median :2021-07-22
## Mean : 11.14 Mean : 0.7477 Mean :2020-06-28
## 3rd Qu.: 14.00 3rd Qu.: 1.0000 3rd Qu.:2022-09-01
## Max. :155.00 Max. :17.0000 Max. :2023-03-09
##
## last_review review_scores_rating review_scores_accuracy
## Min. :2010-08-11 Min. :0.000 Min. :1.000
## 1st Qu.:2022-08-28 1st Qu.:4.700 1st Qu.:4.760
## Median :2022-12-29 Median :4.880 Median :4.920
## Mean :2022-07-02 Mean :4.747 Mean :4.791
## 3rd Qu.:2023-02-19 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :2023-03-21 Max. :5.000 Max. :5.000
## NA's :26
## review_scores_cleanliness review_scores_checkin review_scores_communication
## Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.670 1st Qu.:4.830 1st Qu.:4.860
## Median :4.870 Median :4.960 Median :4.980
## Mean :4.728 Mean :4.839 Mean :4.846
## 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:5.000
## Max. :5.000 Max. :5.000 Max. :5.000
## NA's :26 NA's :26 NA's :26
## review_scores_location review_scores_value license
## Min. :1.00 Min. :1.000 Exempt : 182
## 1st Qu.:4.77 1st Qu.:4.620 Approved by government: 32
## Median :4.93 Median :4.800 STR-2104-FGWRVB : 28
## Mean :4.82 Mean :4.682 STR-2009-HZPDPM : 10
## 3rd Qu.:5.00 3rd Qu.:4.950 STR-2010-GQBDPG : 10
## Max. :5.00 Max. :5.000 (Other) :5319
## NA's :26 NA's :26 NA's :3777
## instant_bookable calculated_host_listings_count
## Mode :logical Min. : 1.000
## FALSE:7568 1st Qu.: 1.000
## TRUE :1790 Median : 2.000
## Mean : 5.094
## 3rd Qu.: 4.000
## Max. :141.000
##
## calculated_host_listings_count_entire_homes
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 1.000
## Mean : 3.407
## 3rd Qu.: 2.000
## Max. :141.000
##
## calculated_host_listings_count_private_rooms
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 0.000
## Mean : 1.634
## 3rd Qu.: 2.000
## Max. :29.000
##
## calculated_host_listings_count_shared_rooms reviews_per_month
## Min. : 0.00000 Min. : 0.010
## 1st Qu.: 0.00000 1st Qu.: 0.300
## Median : 0.00000 Median : 0.770
## Mean : 0.05076 Mean : 1.412
## 3rd Qu.: 0.00000 3rd Qu.: 2.000
## Max. :14.00000 Max. :13.200
##
Some initial things to note from the summary statistics: the majority of listings (66.0%) are entire homes or apartments as opposed to shared living spaces. Private rooms make up the bulk of the remainder at 33.2%.
# Bar chart of the number of listings per room type, bars ordered by count
data %>%
  count(room_type, name = "listings") %>%
  mutate(room_type = fct_reorder(room_type, listings)) %>%
  ggplot(aes(x = room_type, y = listings, fill = room_type)) +
  geom_col() +
  labs(x = "Room Type", y = "Listings") +
  # fill only colours the bars; the x axis already names each room type
  theme(legend.position = "none")
Like any other residential property, the neighbourhood is a likely predictor of the price of the property being rented, so next we look at the distribution of listings across neighbourhoods.
Let’s now look at the distribution of rental prices:
# Histogram of listing prices using bins of width 10
data %>%
  ggplot(aes(x = price)) +
  geom_histogram(binwidth = 10, fill = "blue", color = "black") +
  ggtitle("Distribution of Airbnb Rental Prices") +
  xlab("Price") +
  ylab("Count")
I’ve identified the major outlier in this case to be this listing. To get a better idea, we can check, for example, how many listings there are above a price of $5000:
# Number of listings priced above 5000
data %>%
  filter(price > 5000) %>%
  nrow()
## [1] 3
We find there are only 3 listings. We can then look at a boxplot of the remaining listings after removing the ones above $5000.
# Horizontal boxplot of prices once the extreme listings are removed.
# The cut-off is `<=` so that it is the exact complement of the
# `price > 5000` count: a listing priced at exactly 5000 is kept rather
# than silently dropped.
data %>%
  filter(price <= 5000) %>%
  ggplot(aes(x = price)) +
  geom_boxplot(color = "blue", outlier.color = "red", outlier.size = 2) +
  scale_x_continuous(breaks = c(0, 1000, 2000, 3000, 4000, 5000)) +
  ylim(-4, 4) +
  # draw whisker end caps on top of the box
  stat_boxplot(geom = "errorbar") +
  # the y axis carries no information for a single boxplot, so hide it
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.y = element_blank()
  )
The data seems to get much more spread out above a price of $1500, so we’ll focus in on the listings priced below that:
# Horizontal boxplot restricted to listings priced under 1500
filter(data, price < 1500) %>%
  ggplot(aes(x = price)) +
  geom_boxplot(color = "blue", outlier.color = "red", outlier.size = 2) +
  scale_x_continuous(breaks = seq(0, 1500, by = 250)) +
  ylim(-4, 4) +
  # whisker end caps
  stat_boxplot(geom = "errorbar") +
  # hide the y axis and its horizontal grid lines
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    axis.title.y = element_blank()
  )
This seems like a better representation of most of the price data for the listings.
As with any property value we may expect things like the number of bedrooms in the listing to be a predictor of price. Larger properties should have higher bedroom counts and thus be more expensive to rent:
# Mean price for each bedroom count, among listings priced under 1500
data %>%
  filter(price < 1500) %>%
  # bedrooms is converted to a factor so each count gets its own bar
  group_by(bedrooms = as.factor(bedrooms)) %>%
  summarize(mean_price = mean(price)) %>%
  ggplot(aes(x = bedrooms, y = mean_price, fill = bedrooms)) +
  geom_col() +
  scale_y_continuous(breaks = seq(0, 1500, by = 250)) +
  labs(x = "# of Bedrooms", y = "Mean Price") +
  theme(legend.position = "none")
There appears to be only 1 listing with 9 bedrooms.
# Show the listings with 9 bedrooms that are priced under 1500
data %>%
  filter(price < 1500, bedrooms == 9)
Because the price is so wildly off from the overall trend, I think it’s best to remove this point from the model.
Next it seems likely review scores should be a strong predictor of prices:
# Scatter plot of price against overall review score with a loess trend
# line, limited to listings priced under 1500, with fewer than 9 bedrooms
# and more than 5 reviews
data %>%
  filter(price < 1500, bedrooms < 9, number_of_reviews > 5) %>%
  ggplot(aes(x = review_scores_rating, y = price)) +
  geom_point(alpha = 0.5, color = "#43a2ca") +
  geom_smooth(method = "loess", se = FALSE, color = "red") +
  ggtitle("Price vs. Review Scores Rating") +
  xlab("Review Scores") +
  ylab("Price")
We can similarly do some quick visualizations to see the relationships among other predictors and the price variable:
# Histogram of the minimum-nights requirement (bins of width 10) for
# listings priced under 1500, with fewer than 9 bedrooms and more than
# 5 reviews. The plot title previously read "Minimun"; corrected here.
data %>%
  filter(price < 1500 & bedrooms < 9 & number_of_reviews > 5) %>%
  ggplot(aes(x = minimum_nights)) +
  geom_histogram(binwidth = 10, fill = "blue", color = "black") +
  labs(
    title = "Distribution of Minimum Nights",
    x = "Nights",
    y = "Count"
  )
Clearly we have a pretty wide tail to the right on this distribution, suggesting that there are again a few extreme outliers in this data.
# Frequency table of minimum_nights values, most common first
data %>%
  filter(price < 1500, bedrooms < 9, number_of_reviews > 5) %>%
  count(minimum_nights, sort = TRUE)
# Inspect the listings whose minimum stay exceeds 1000 nights
data %>%
  filter(
    price < 1500,
    bedrooms < 9,
    number_of_reviews > 5,
    minimum_nights > 1000
  )
Interestingly, by far the most common minimum-nights selection on Airbnb is a full-month stay, followed by 1, 2 and 3 nights. The outliers we’re seeing in minimum nights largely seem to be inactive listings, with last reviews here, for example, being from 2016. We would like to remove listings like this, but instead of setting a cut-off for minimum nights we will do this by last review date. This ensures that the pricing we’re seeing in the data comes from currently active listings.
# Histogram of each listing's most recent review date (binwidth of 10)
# for listings priced under 1500, with fewer than 9 bedrooms and more
# than 5 reviews
data %>%
  filter(price < 1500, bedrooms < 9, number_of_reviews > 5) %>%
  ggplot(aes(x = last_review)) +
  geom_histogram(binwidth = 10, fill = "blue", color = "black") +
  ggtitle("Distribution of Latest Review Date") +
  xlab("Last Review Date") +
  ylab("Count")
As we can see, the vast majority of these listings are recent; however, we are getting a stretch of listings dating back all the way to 2015. To ensure the pricing is accurate, especially considering the price disturbances caused during Covid, we’ll remove listings that have not received a review since the start of 2020.
# Listings priced under 1500, with fewer than 9 bedrooms, more than 5
# reviews, and a most recent review on or after 2020-01-01
data %>%
  filter(
    price < 1500,
    bedrooms < 9,
    number_of_reviews > 5,
    last_review >= as.Date("2020-01-01")
  )